#!/usr/bin/env python3

import requests
import time
from rdflib import BNode, Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF
from string import Template

# Schema.org namespace; supplies the class and properties used for each topic.
SCHEMA = Namespace ('http://schema.org/')

# Graph that accumulates every topic before it is written out. Binding the
# prefix lets serialized output abbreviate these IRIs as "schema:".
dokkg = Graph ()
dokkg.bind ('schema', SCHEMA)

def query (query_string, limit=100000, offset=0):
    """Run one page of a query against the local SPARQL endpoint.

    query_string is treated as a string.Template: any $limit and $offset
    placeholders are replaced with the given values before the query is
    POSTed to the endpoint.

    Returns the decoded JSON body of the response.

    Raises requests.HTTPError if the endpoint answers with a 4xx/5xx status,
    plus whatever requests raises on connection failure or timeout, and
    KeyError/ValueError from Template.substitute if query_string contains
    other or malformed placeholders.
    """
    
    response = requests.post (
        'http://localhost:3030/dokk/sparql',
        timeout = 60,
        data = {
            'format': 'json',
            'query': Template (query_string).substitute ({ 'limit': limit, 'offset': offset })
        })
    
    # Fail loudly on an HTTP error instead of trying to decode an error
    # response as if it were a result set.
    response.raise_for_status ()
    
    return response.json ()

# Name and short description of every manpage in the "manpages" graph.
# $limit and $offset are filled in by query(). The ORDER BY is required for
# paging: without a defined order, LIMIT/OFFSET windows are not guaranteed to
# be disjoint and complete across requests.
QUERY_MANPAGES = """
    PREFIX graph:         <graphdata:name:>
    PREFIX manpage-terms: <urn:manpage:terms:>
    PREFIX manpage:       <urn:manpage:>
    PREFIX schema:        <http://schema.org/>
    
    SELECT ?name ?description
    FROM graph:manpages
    WHERE
    {
        []  a schema:TextDigitalDocument ;
            schema:identifier ?name ;
            schema:disambiguatingDescription ?description .
    }
    ORDER BY ?name ?description
    LIMIT $limit
    OFFSET $offset
"""

# Id and name of every non-deprecated SPDX license in the "spdx" graph.
# Some licenses have more than one name, so MIN keeps a single one per id.
# MIN picks the smallest string, which prefers the spelling with capital
# letters, e.g. "BSD 2-Clause" instead of "BSD 2-clause".
# $limit and $offset are filled in by query(). The ORDER BY is required for
# paging: without a defined order, LIMIT/OFFSET windows are not guaranteed to
# be disjoint and complete across requests.
QUERY_SPDX = """
    PREFIX graph:  <graphdata:name:>
    PREFIX schema: <http://schema.org/>
    PREFIX spdx:   <http://spdx.org/rdf/terms#>

    SELECT ?id (MIN (?_name) AS ?name)
    FROM graph:spdx
    WHERE
    {
        ?license a spdx:License ;
                   spdx:licenseId ?id ;
                   spdx:name ?_name .
        
        FILTER NOT EXISTS { ?license spdx:isDeprecatedLicenseId [] }
    }
    GROUP BY ?id
    ORDER BY ?id
    LIMIT $limit
    OFFSET $offset
"""

# Page through all manpages and add each one to the graph as a schema:Thing
# with a name, a description and a site-relative URL.
limit = 50000
offset = 0
while True:
    bindings = query (QUERY_MANPAGES, limit=limit, offset=offset)['results']['bindings']
    
    # An empty page means the previous request returned the last results.
    if not bindings:
        break
    
    for item in bindings:
        name = item['name']['value']
        description = item['description']['value']
        thing = URIRef ('https://dokk.org/manpages/' + name)
        
        dokkg.add ((thing,
                   RDF.type,
                   SCHEMA.Thing))
        dokkg.add ((thing,
                   SCHEMA.name,
                   Literal (name)))
        # Upper-case only the first character. str.capitalize() would also
        # lower-case the rest of the text, mangling acronyms and proper
        # names such as "GNU" or "POSIX".
        dokkg.add ((thing,
                   SCHEMA.description,
                   Literal (description[:1].upper () + description[1:])))
        dokkg.add ((thing,
                   SCHEMA.url,
                   Literal ('/manpages/' + name)))
    
    offset += limit

# Page through the SPDX licenses returned by QUERY_SPDX and add each one to
# the graph as a schema:Thing. The license id doubles as the description and
# as the last segment of both the topic IRI and the site-relative URL.
limit = 50000
offset = 0
while True:
    page = query (QUERY_SPDX, limit=limit, offset=offset)
    rows = page['results']['bindings']
    
    # No rows left: every page has been consumed.
    if not rows:
        break
    
    for row in rows:
        license_id = row['id']['value']
        license_name = row['name']['value']
        thing = URIRef ('https://dokk.org/licenses/' + license_id)
        
        statements = (
            (RDF.type, SCHEMA.Thing),
            (SCHEMA.name, Literal (license_name)),
            (SCHEMA.description, Literal (license_id)),
            (SCHEMA.url, Literal ('/licenses/' + license_id)),
        )
        for predicate, value in statements:
            dokkg.add ((thing, predicate, value))
    
    offset += limit

# Dump the accumulated topics to disk in Turtle syntax.
dokkg.serialize (destination='topics.ttl', format='turtle')















